{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.dataframe as ddf\n",
    "import time\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = ddf.read_csv('random_people.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfo.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['bonus'] = df['salary']*.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfo = pd.read_csv('random_people.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfo['bonus'] = dfo['salary']*.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def benchmark(function, function_name):\n",
    "    start = time.time()\n",
    "    function()\n",
    "    end = time.time()\n",
    "    print(\"{0} seconds for {1}\".format((end - start), function_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_bonus(df):\n",
    "    df['bonus'] = df['salary']*.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_1():\n",
    "    get_bonus(df)\n",
    "def test_2():\n",
    "    get_bonus(dfo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "benchmark(test_1, 'dataframe nuevo')\n",
    "benchmark(test_2, 'dataframe viejo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = pd.concat([dfo for _ in range(1000)])\n",
    "\n",
    "df3 = pd.concat([df2 for _ in range(500)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfn = ddf.from_pandas(df3, npartitions=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del dfn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_big():\n",
    "    get_bonus(dfn)\n",
    "def test_big_old():\n",
    "    get_bonus(df3)\n",
    "\n",
    "def get_big_mean():\n",
    "    return dfn.salary.mean().compute()\n",
    "def get_big_mean_old():\n",
    "    return df3.salary.mean()\n",
    "\n",
    "def get_big_max():\n",
    "    return dfn.salary.max().compute()\n",
    "def get_big_max_old():\n",
    "    return df3.salary.max()\n",
    "\n",
    "def get_big_sum():\n",
    "    return dfn.salary.sum().compute()\n",
    "def get_big_sum_old():\n",
    "    return df3.salary.sum()\n",
    "\n",
    "def filter_df():\n",
    "    df = dfn[dfn['salary']>5000]\n",
    "def filter_df_old():\n",
    "    df = df3[df3['salary']>5000]\n",
    "    \n",
    "def run_benchmarks():\n",
    "    for i,f in enumerate([test_big, #test_big_old,\n",
    "                          get_big_mean,# get_big_mean_old,\n",
    "                          get_big_max, #get_big_max_old,\n",
    "                          get_big_sum, #get_big_sum_old,\n",
    "                          filter_df,#filter_df_old\n",
    "                         ]):\n",
    "        benchmark(f, f.__name__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    return (13*x+5)%7\n",
    "\n",
    "def apply_random_old():\n",
    "    df3['random']= df3['salary'].apply(f)\n",
    "    \n",
    "def apply_random():\n",
    "    dfn['random']= dfn['salary'].apply(f).compute()\n",
    "\n",
    "def value_count_test():\n",
    "    dfn.salary.value_counts().compute()\n",
    "\n",
    "def value_count_test_old():\n",
    "    df3.salary.value_counts()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_benchmarks()\n",
    "benchmark(apply_random, apply_random.__name__)\n",
    "#benchmark(apply_random_old, apply_random_old.__name__)\n",
    "benchmark(value_count_test, value_count_test.__name__)\n",
    "#benchmark(value_count_test_old, value_count_test_old.__name__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfn.salary.value_counts().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfn.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
